{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#导入必要的工具包\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "from matplotlib import pyplot\n",
    "import seaborn as sns\n",
    "%matplotlib inline\n",
    "\n",
    "from sklearn.preprocessing import normalize\n",
    "from sklearn.cluster import MiniBatchKMeans\n",
    "\n",
    "from sklearn import metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "dpath = './data/'\n",
    "data = pd.read_csv(dpath + \"training.csv\")\n",
    "data.columns = [\"target\",\"content\"]#将数据分为两个属性\n",
    "\n",
    "data[\"target\"] = data[\"target\"]. map(lambda s: int(s) - 1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 特征探索"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 4773 entries, 0 to 4772\n",
      "Data columns (total 2 columns):\n",
      "target     4773 non-null int64\n",
      "content    4773 non-null object\n",
      "dtypes: int64(1), object(1)\n",
      "memory usage: 74.7+ KB\n"
     ]
    }
   ],
   "source": [
    "data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>4773.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>4.070186</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>2.286824</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>2.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>3.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>5.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>10.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            target\n",
       "count  4773.000000\n",
       "mean      4.070186\n",
       "std       2.286824\n",
       "min       0.000000\n",
       "25%       2.000000\n",
       "50%       3.000000\n",
       "75%       5.000000\n",
       "max      10.000000"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY4AAAEKCAYAAAAFJbKyAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAGHRJREFUeJzt3X20XXV95/H3hwAK+BAeIisSMGgzKqUupRkKMoNUrIIiwQdmpIopxUanaFG6loDjFB37oNanOlYsFSyuKgwGNZGiQhFwOh0ewqNBsERAiCCJgoCgAvKdP/a+zTHc3Jyde88593Lfr7XuOnv/9j7n+zsrcD9379/ev52qQpKkfm016g5IkmYWg0OS1InBIUnqxOCQJHVicEiSOjE4JEmdGBySpE4MDklSJwaHJKmTrUfdgUHYZZddauHChaPuhiTNKFddddWPq2re5vZ7QgbHwoULWbVq1ai7IUkzSpIf9LOfp6okSZ0YHJKkTgwOSVInBockqRODQ5LUicEhSerE4JAkdWJwSJI6MTgkSZ08Ie8cn+3OO+PQgdc47A+/Pm77x7/4ioHWfdfvf3Ogny9p8zzikCR1YnBIkjoxOCRJnRgckqRODA5JUicGhySpE4NDktTJwIIjyRlJ1iVZ3dP210luSnJ9kq8kmduz7eQka5J8L8kretoPadvWJDlpUP2VJPVnkEcc/wAcslHbhcDeVfUC4N+AkwGS7AW8AfjN9j2fTjInyRzgb4FDgb2Ao9p9JUkjMrDgqKpvA/ds1HZBVT3arl4GLGiXlwBnV9Uvq+pWYA2wb/uzpqpuqaqHgbPbfSVJIzLKMY4/BMbmrdgNuKNn29q2bVPtj5NkWZJVSVatX79+AN2VJMGIgiPJfwceBb4w1jTObjVB++Mbq06rqsVVtXjevHlT01FJ0uMMfZLDJEuBw4CDq2osBNYCu/fstgC4s13eVLskaQSGesSR5BDgRODwqnqoZ9NK4A1JnpRkT2ARcAVwJbAoyZ5JtqUZQF85zD5Lkn7dwI44kpwFHATskmQtcArNVVRPAi5MAnBZVb2tqm5Icg7wXZpTWMdV1a/az3k78E1gDnBGVd0wqD5LkjZvYMFRVUeN03z6BPv/BfAX47SfD5w/hV2TJE2Cd45LkjoxOCRJnRgckqRODA5JUicGhySpE4NDktSJwSFJ6sTgkCR1YnBIkjoxOCRJnRgckqRODA5JUicGhySpE4NDktSJwSFJ6sTgkCR1YnBIkjoxOCRJnRgckqRODA5JUicGhySpE4NDktSJwSFJ6mRgwZHkjCTrkqzuadspyYVJbm5fd2zbk+STSdYkuT7JPj3vWdruf3OSpYPqrySpP4M84vgH4JCN2k4CLqqqRcBF7TrAocCi9mcZcCo0QQOcAvwOsC9wyljYSJJGY2DBUVXfBu7ZqHkJcGa7fCZwRE/756txGTA3yXzgFcCFVXVPVd0LXMjjw0iSNETDHuPYtaruAmhfn9G27wbc0bPf2rZtU+2SpBGZLoPjGaetJmh//Acky5KsSrJq/fr1U9o5SdIGww6Ou9tTULSv69r2tcDuPfstAO6coP1xquq0qlpcVYvnzZs35R2XJDWGHRwrgbEro5YCK3ra39xeXbUfcF97KuubwMuT7NgOir+8bZMkjcjWg/rgJGcBBwG7JFlLc3XUB4FzkhwL3A4c2e5+PvBKYA3wEHAMQFXdk+QDwJXtfv+zqjYecJckDdHAgqOqjtrEpoPH2beA4zbxOWcAZ0xh1yRJkzBdBsclSTOEwSFJ6sTgkCR1YnBIkjoxOCRJnRgckqRODA5JUiebDY4ke/bTJkmaHfo54jh3nLblU90RSdLMsMk7x5M8D/hN4OlJXtuz6WnAkwfdMUnS9DTRlCPPBQ4D5gKv7ml/APijQXZKkjR9bTI4qmoFsCLJ/lX1/4bYJ0nSNNbPGMdPklyUZDVAkhckee+A+yVJmqb6CY6/B04GHgGoquuBNwyyU5Kk6auf4Ni+qq7YqO3RQXRGkjT99RMcP07yHNpnfSd5PXDXQHslSZq2+nmQ03HAacDzkvwQuBV400B7JUmatjYbHFV1C/CyJDsAW1XVA4PvliRputpscCQ5YaN1gPuAq6rq2gH1S5I0TfUzxrEYeBuwW/uzDDgI+Psk7x5c1yRJ01E/Yxw7A/tU1c8AkpxCM1fVgcBVwIcH1z1J0nTTzxHHHsDDPeuPAM+qqp8DvxxIryRJ01Y/RxxfBC5LsqJdfzVwVjtY/t2B9UySNC31c1XVB5KcD/wnIMDbqmpVu/mNg+ycJGn6mTA4kmwFXF9Ve9OMZ0yJJO8C3kJzU+F3gGOA+cDZwE7A1cDRVfVwkicBnwd+G/gJ8F+r6rap6oskqZsJxziq6jHguiR7TFXBJLsBfwIsbgNpDs3cVx8CPl5Vi4B7gWPbtxwL3FtVvwF8vN1PkjQi/YxxzAduSHIF8OBYY1UdPsm62yV5BNieZgqTlwK/324/E3gfcCqwpF2G5mquTyVJVdUk6ktT5lXn/t1AP/+fXvfWgX6+1FU/wfH+qSxYVT9M8hHgduDnwAU0p8F+WlVjkyeupblnhPb1jva9jya5j+YS4R/3fm6SZTT3mLDHHlN2gCRJ2kg/g+OXTmXBJDvSHEXsCfwU+BJw6Hilx94ywbYNDVWn0cypxeLFiz0akaQB2ex9HEn2S3Jlkp8leTjJr5LcP4maLwNurar1VfUI8GXgxcDcJGNBtgC4s11eC+ze9mVr4OnAPZOoL0mahH5uAPwUcBRwM7AdzdVQn5pEzduB/ZJsn2biq4Np7ge5GHh9u89SYOy+kZXtOu32bzm+IUmj009wUFVrgDlV9auq+hzNXFVbpKoupxnkvprmUtytaE4xnQickGQNzRjG6e1bTgd2bttPAE7a0tqSpMnrZ3D8oSTbAtcm+TDNFVA7TKZoVZ0CnLJR8y3AvuPs+wvgyMnUkyRNnX6OOI5u93s7zeW4uwOvHWSnJEnTVz/BcURV/aKq7q+q91fVCcBhg+6YJGl66ic4lo7T9gdT3A9J0gyxyTGOJEfR3Mm9Z5KVPZueSjNnlCRpFppocPxfaQbCdwE+2tP+AHD9IDslSZq+NhkcVfUD4AfA/sPrjiRpuuvrPg5JksYYHJKkTjYZHEkual99/oUk6d9NNDg+P8lLgMOTnM1Gs9RW1dUD7ZkkaVqaKDj+jGZeqAXAxzbaVjQPXpIkzTITXVW1HFie5H9U1QeG2CdJ0jTWz4OcPpDkcODAtumSqjpvsN2SJE1X/TzI6a+A42memfFd4Pi2TZI0C/UzrfqrgBdW1WMASc4ErgFOHmTHJEnTU7/3ccztWX76IDoiSZoZ+jni+CvgmiQX01ySeyAebUjSrNXP4PhZSS4B/iNNcJxYVT8adMckSdNTP0ccVNVdwMrN7ihJesJzripJUicGhySpkwmDI8lWSVYPqzOSpOlvwuBo7924LskeQ+qPJGma6+dU1XzghiQXJVk59jOZoknmJlme5KYkNybZP8lOSS5McnP7umO7b5J8MsmaJNcn2WcytSVJk9PPVVXvH0DdvwG+UVWvT7ItsD3wHuCiqvpgkpNoZuY9ETgUWNT+/A5wavsqSRqBzR5xVNWlwG3ANu3ylcAWP4sjydNobiI8vf38h6vqp8AS4Mx2tzOBI9rlJcDnq3EZMDfJ/C2tL0manH4mOfwjYDnwd23TbsBXJ1Hz2cB64HNJrkny2SQ7ALu294uM3TfyjJ56d/S8f23bJkkagX7GOI4DDgDuB6iqm9nwS31LbA3sA5xaVS8CHqQ5LbUpGaetHrdTsizJqiSr1q9fP4nuSZIm0k9w/LKqHh5bSbI14/zi7mAtsLaqLm/Xl9MEyd1jp6Da13U9++/e8/4FwJ0bf2hVnVZVi6tq8bx58ybRPUnSRPoJjkuTvAfYLsnvAV8CvralBdt5ru5I8ty26WCa53ysBJa2bUuBFe3ySuDN7dVV+wH3jZ3SkiQNXz9XVZ0EHAt8B3grcD7w2UnWfQfwhfaKqluAY2hC7JwkxwK3A0e2+54PvBJYAzzU7itJGpF+Zsd9rH140+U0p6i+V1WTOVVFVV0LLB5n08Hj7Fs04yySpGlgs8GR5FXAZ4Dv0wxU75nkrVX19UF3TpI0/fRzquqjwO9W1RqAJM8B/gkwOCRpFupncHzdWGi0bmHDFU+SpFlmk0ccSV7bLt6Q5HzgHJoxjiNp7h6XJM1CE52qenXP8t3AS9rl9cCOA+uRJGla22RwVJWXvUqSHqefq6r2pLnvYmHv/lV1+OC6JUmarvq5quqrNDPZfg14bLDdkSRNd/0Exy+q6pMD74kkaUboJzj+JskpwAXAL8caq2qLn8khSZq5+gmO3wKOBl7KhlNV1a5LkmaZfoLjNcCze6dWlyTNXv3cOX4dMHfQHZEkzQz9HHHsCtyU5Ep+fYzDy3ElaRbqJzhOGXgvJEkzRj/P47h0GB2RJM0M/dw5/gAbnjG+LbAN8GBVPW2QHZMkTU/9HHE8tXc9yRHAvgPrkSRpWuvnqqpfU1VfxXs4JGnW6udU1Wt7VreieVb4pJ45Lkmaufq5qqr3uRyPArcBSwbSG0nStNfPGIfP5ZAk/buJHh37ZxO8r6rqAwPojyRpmpvoiOPBcdp2AI4FdgYMDkmahTZ5VVVVfXTsBzgN2A44BjgbePZkCyeZk+SaJOe163smuTzJzUn+d5Jt2/Yntetr2u0LJ1tbkrTlJrwcN8lOSf4cuJ7m6GSfqjqxqtZNQe3jgRt71j8EfLyqFgH30hzZ0L7eW1W/AXy83U+SNCKbDI4kfw1cCTwA/FZVva+q7p2KokkWAK8CPtuuh+bekOXtLmcCR7TLS9p12u0Ht/tLkkZgoiOOPwWeCbwXuDPJ/e3PA0nun2TdTwDvZsODoXYGflpVj7bra4Hd2uXdgDsA2u33tftLkkZgk4PjVdX5rvJ+JDkMWFdVVyU5aKx5vC70sa33c5cBywD22GOPKeipJGk8AwmHzTgAODzJbTQD7S+lOQKZm2QsyBYAd7bLa4HdAdrtTwfu2fhDq+q0qlpcVYvnzZs32G8gSbPY0IOjqk6uqgVVtRB4A/CtqnojcDHw+na3pcCKdnllu067/VtV5ZQnkjQiozji2JQTgROSrKEZwzi9bT8d2LltPwE4aUT9kyTR31xVA1NVlwCXtMu3MM507VX1C+DIoXZMkrRJ0+mIQ5I0AxgckqRODA5JUicGhySpE4NDktSJwSFJ6sTgkCR1YnBIkjoxOCRJnRgckqRORjrliKQtd/jyFZvfaRJWvn7JQD9fM5dHHJKkTgwOSVInBockqRODQ5LUiYPjkrQZN3367oHXeN4f7zrwGlPFIw5JUicGhySpE4NDktSJwSFJ6sTgkCR1YnBIkjoxOCRJnXgfh54QDl3xtoHX+PqSzwy8hjQTDP2II8nuSS5OcmOSG5Ic37bvlOTCJDe3rzu27UnyySRrklyfZJ9h91mStMEoTlU9CvxpVT0f2A84LslewEnARVW1CLioXQc4FFjU/iwDTh1+lyVJY4Z+qqqq7gLuapcfSHIjsBuwBDio3e1M4BLgxLb981VVwGVJ5iaZ336OpBE48tzVA/38L71u74F+viZnpIPjSRYCLwIuB3YdC4P29RntbrsBd/S8bW3btvFnLUuyKsmq9evXD7LbkjSrjSw4kjwFOBd4Z1XdP9Gu47TV4xqqTquqxVW1eN68eVPVTUnSRkYSHEm2oQmNL1TVl9vmu5PMb7fPB9a17WuB3XvevgC4c1h9lST9ulFcVRXgdODGqvpYz6aVwNJ2eSmwoqf9ze3VVfsB9zm+IUmjM4r7OA4Ajga+k+Tatu09wAeBc5IcC9wOHNluOx94JbAGeAg4ZrjdlST1GsVVVf/C+OMWAAePs38Bxw20U5KkvnnnuKQZ45xzfzzQz/8vr9tloJ//ROFcVZKkTgwOSVInBockqRODQ5LUicEhSerE4JAkdWJwSJI6MTgkSZ0YHJKkTgwOSVInTjkyQN//X0sG+vnPeceKze8kaUa7+xNXDPTzd33nvp3f4xGHJKkTg0OS1InBIUnqxOCQJHVicEiSOjE4JEmdPOEvx11/6j8OvMa8//amgdeQpOnCIw5JUicGhySpE4NDktSJwSFJ6mTGBEeSQ5J8L8maJCeNuj+SNFvNiOBIMgf4W+BQYC/gqCR7jbZXkjQ7zYjgAPYF1lTVLVX1MHA2MNipZyVJ45opwbEbcEfP+tq2TZI0ZKmqUfdhs5IcCbyiqt7Srh8N7FtV7+jZZxmwrF19LvC9SZTcBfjxJN4/0+qOsvZsqzvK2n7n2VF7MnWfVVXzNrfTTLlzfC2we8/6AuDO3h2q6jTgtKkolmRVVS2eis+aCXVHWXu21R1lbb/z7Kg9jLoz5VTVlcCiJHsm2RZ4A7ByxH2SpFlpRhxxVNWjSd4OfBOYA5xRVTeMuFuSNCvNiOAAqKrzgfOHVG5KTnnNoLqjrD3b6o6ytt95dtQeeN0ZMTguSZo+ZsoYhyRpmjA4eoxqWpMkZyRZl2T1sGq2dXdPcnGSG5PckOT4IdZ+cpIrklzX1n7/sGq39eckuSbJeUOseVuS7yS5NsmqYdVta89NsjzJTe2/9/5DqPnc9ruO/dyf5J2DrttT/13tf1urk5yV5MlDqnt8W/OGQX/f8X53JNkpyYVJbm5fd5zywlXlT3O6bg7wfeDZwLbAdcBeQ6p9ILAPsHrI33k+sE+7/FTg34b4nQM8pV3eBrgc2G+I3/0E4IvAeUOseRuwyzD/jXtqnwm8pV3eFpg75PpzgB/R3CcwjHq7AbcC27Xr5wB/MIS6ewOrge1pxpD/GVg0wHqP+90BfBg4qV0+CfjQVNf1iGODkU1rUlXfBu4ZRq2N6t5VVVe3yw8ANzKkO/Kr8bN2dZv2ZygDbkkWAK8CPjuMeqOW5Gk0v2BOB6iqh6vqp0PuxsHA96vqB0OsuTWwXZKtaX6R37mZ/afC84HLquqhqnoUuBR4zaCKbeJ3xxKaPxRoX4+Y6roGxwazelqTJAuBF9H85T+smnOSXAusAy6sqmHV/gTwbuCxIdUbU8AFSa5qZzoYlmcD64HPtafnPptkhyHWh+beq7OGVayqfgh8BLgduAu4r6ouGELp1cCBSXZOsj3wSn795uVh2LWq7oLmj0PgGVNdwODYIOO0zYpLzpI8BTgXeGdV3T+sulX1q6p6Ic1MAPsm2XvQNZMcBqyrqqsGXWscB1TVPjSzPB+X5MAh1d2a5nTGqVX1IuBBmlMYQ9HetHs48KUh1tyR5i/vPYFnAjskedOg61bVjcCHgAuBb9Cc8n500HWHzeDYYLPTmjwRJdmGJjS+UFVfHkUf2tMmlwCHDKHcAcDhSW6jOR350iT/OIS6VNWd7es64Cs0p0eHYS2wtueIbjlNkAzLocDVVXX3EGu+DLi1qtZX1SPAl4EXD6NwVZ1eVftU1YE0p5FuHkbdHncnmQ/Qvq6b6gIGxwazblqTJKE5731jVX1syLXnJZnbLm9H8z/6TYOuW1UnV9WCqlpI82/8raoa+F+iSXZI8tSxZeDlNKc1Bq6qfgTckeS5bdPBwHeHUbt1FEM8TdW6Hdgvyfbtf+cH04zhDVySZ7SvewCvZfjffSWwtF1eCqyY6gIz5s7xQasRTmuS5CzgIGCXJGuBU6rq9CGUPgA4GvhOO9YA8J5q7tIftPnAme1DurYCzqmqoV0aOwK7Al9pfoexNfDFqvrGEOu/A/hC+0fRLcAxwyjanuf/PeCtw6g3pqouT7IcuJrmVNE1DO9O7nOT7Aw8AhxXVfcOqtB4vzuADwLnJDmWJkCPnPK67SVbkiT1xVNVkqRODA5JUicGhySpE4NDktSJwSFJ6sTgkLZAO9vsHw+hzkFJhnLjmtQvg0PaMnOBvoMjjS35/+0ghnTHs9Qv7+OQtkCSsdmTvwdcDLwA2JFmlt/3VtWKduLIr7fb96eZpfRlwIk009ncDPyyqt6eZB7wGWCPtsQ7gR8ClwG/opmk8B1V9X+G8f2kiRgc0hZoQ+G8qtp7bNruqro/yS40v+wXAc+iuUv7xVV1WZJnAv9KM0/UA8C3gOva4Pgi8Omq+pd2qopvVtXzk7wP+FlVfWTY31HaFKcckSYvwF+2s90+RjMd/67tth9U1WXt8r7ApVV1D0CSLwH/od32MmCvdkoSgKeNzW0lTTcGhzR5bwTmAb9dVY+0M++OPab0wZ79xpu6f8xWwP5V9fPexp4gkaYNB8elLfMAzeN2AZ5O84yPR5L8Ls0pqvFcAbwkyY7t6a3X9Wy7AHj72EqSF45TR5oWDA5pC1TVT4D/m2Q18EJgcZJVNEcf404P3z6V7i9pnrL4zzRTm9/Xbv6T9jOuT/Jd4G1t+9eA1yS5Nsl/HtgXkjpwcFwaoiRPqaqftUccX6GZvv8ro+6X1IVHHNJwva999slq4FbgqyPuj9SZRxySpE484pAkdWJwSJI6MTgkSZ0YHJKkTgwOSVInBockqZP/D3SC51l4SV7ZAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "sns.countplot(data[\"target\"]);\n",
    "pyplot.xlabel('target');\n",
    "pyplot.ylabel('Number of target');"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "由图中我们可以看出，类别的分类很不均衡，集中在第2、3、4、5、6类"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 对数据做jieba分词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import jieba\n",
    "\n",
    "stopWords = \"的为是，。一和或（）、\"\n",
    "stopWords=list(stopWords)\n",
    "# 把文本分词并去除停用词，返回数组\n",
    "def wordsCut(words):\n",
    "    result = jieba.cut(words)\n",
    "    newWords = []\n",
    "    for s in result:\n",
    "        if s not in stopWords:\n",
    "            newWords.append(s)\n",
    "    return ' '.join(newWords) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache C:\\Users\\rencw\\AppData\\Local\\Temp\\jieba.cache\n",
      "Loading model cost 0.707 seconds.\n",
      "Prefix dict has been built succesfully.\n"
     ]
    }
   ],
   "source": [
    "data[\"content\"] = data[\"content\"].apply(lambda x: wordsCut(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>target</th>\n",
       "      <th>content</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>公司 主营业务 向 中小 微 企业 个体 工商户 农户 等 客户 提供 贷款 服务 自 设立...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>公司 立足于 商业地产 服务 致力于 商业地产 开发 销售 运营 全 产业链 提供 一整套 ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>公司 经 工商管理 部门 核准 经营范围 “ 投资 咨询 经济 信息 咨询 企业 管理 咨询...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>该 公司 主营业务 在 中国 境内 ( 港 澳 台 除外 ) 开展 保险代理 销售 依托 于...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>公司 主营业务 地铁 商业 物业 租赁 与 运营 管理 服务 公司 以 整体 租赁 方式 取...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   target                                            content\n",
       "0       1  公司 主营业务 向 中小 微 企业 个体 工商户 农户 等 客户 提供 贷款 服务 自 设立...\n",
       "1       0  公司 立足于 商业地产 服务 致力于 商业地产 开发 销售 运营 全 产业链 提供 一整套 ...\n",
       "2       1  公司 经 工商管理 部门 核准 经营范围 “ 投资 咨询 经济 信息 咨询 企业 管理 咨询...\n",
       "3       1  该 公司 主营业务 在 中国 境内 ( 港 澳 台 除外 ) 开展 保险代理 销售 依托 于...\n",
       "4       0  公司 主营业务 地铁 商业 物业 租赁 与 运营 管理 服务 公司 以 整体 租赁 方式 取..."
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  采用TFIDFvectorizer进行特征处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer\n",
    "cv = TfidfVectorizer()\n",
    "cv_fit=cv.fit_transform(data[\"content\"])\n",
    "term2id_dict = cv.vocabulary_\n",
    "x=cv_fit.toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4773, 30732)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4773,)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y=data[\"content\"]\n",
    "y.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## KMeans聚类"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 一个参数点（聚类数据为K）的模型\n",
    "def K_cluster_analysis(K, X):\n",
    "    print(\"K-means begin with clusters: {}\".format(K));\n",
    "    \n",
    "    #K-means,在训练集上训练\n",
    "    mb_kmeans = MiniBatchKMeans(n_clusters = K)\n",
    "    y_pred = mb_kmeans.fit_predict(X)\n",
    "    \n",
    "    # K值的评估标准\n",
    "    #本案例中训练数据有标签，可采用有参考模型的评价指标\n",
    "    #v_score = metrics.v_measure_score(y_val, y_val_pred)\n",
    "    \n",
    "    #亦可采用无参考默的评价指标：轮廓系数Silhouette Coefficient和Calinski-Harabasz Index\n",
    "    #这两个分数值越大则聚类效果越好\n",
    "    CH_score = metrics.calinski_harabaz_score(X, y_pred)\n",
    "    \n",
    "    #轮廓系数Silhouette Coefficient在大样本时计算太慢\n",
    "    #si_score = metrics.silhouette_score(X, y_pred)\n",
    "    \n",
    "    print(\"CH_score: {}\".format(CH_score))\n",
    "    #print(\"si_score: {}\".format(si_score))\n",
    "    \n",
    "    return CH_score#,si_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "K-means begin with clusters: 5\n",
      "CH_score: 30.39994891925296\n",
      "K-means begin with clusters: 10\n",
      "CH_score: 19.64103661214555\n",
      "K-means begin with clusters: 15\n",
      "CH_score: 15.670902447618701\n",
      "K-means begin with clusters: 20\n",
      "CH_score: 14.015086808967482\n",
      "K-means begin with clusters: 30\n",
      "CH_score: 9.730083373137838\n",
      "K-means begin with clusters: 40\n",
      "CH_score: 7.326573968496655\n",
      "K-means begin with clusters: 50\n",
      "CH_score: 6.028285175431658\n"
     ]
    }
   ],
   "source": [
    "# 设置超参数（聚类数目K）搜索范围\n",
    "Ks = [5, 10, 15, 20, 30,40,50]\n",
    "CH_scores = []\n",
    "#si_scores = []\n",
    "for K in Ks:\n",
    "    ch = K_cluster_analysis(K,x)\n",
    "    CH_scores.append(ch)\n",
    "    #si_scores.append(si)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAD8CAYAAABn919SAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAG2hJREFUeJzt3XmQVeWZx/HvA7Rsgog0S6AVEQQZxEY7DBRGtiCCRmEMiSEqas+YqZhyiRM1jhlcJolJ3GZSjhWirEMQ4oYLMraIC44ytkQRRETAla0JEhYVQvPOH8/tdDc0vd57z73n/j5Vt7rv6dPex1PVP956VwshICIi2a9Z1AWIiEhyKNBFRGJCgS4iEhMKdBGRmFCgi4jEhAJdRCQmFOgiIjGhQBcRiQkFuohITLRI54d16tQp9OzZM50fKSKS9d58883tIYT8uu5La6D37NmT0tLSdH6kiEjWM7OP6nOfulxERGJCgS4iEhMKdBGRmFCgi4jEhAJdRCQmFOgiIjGhQBcRiYmsCPSSErjzzqirEBHJbHUGupm1MrP/M7O3zWy1md2WuH6imS03s3VmNt/MjkpVkSUlcMstsGVLqj5BRCT71aeFvg8YFUI4DSgEzjGzIcCvgHtDCH2Az4HiVBV5xRVQXg6zZ6fqE0REsl+dgR7cnsTbvMQrAKOARxLXZwETUlIh0K8fDBsGDz0EIaTqU0REslu9+tDNrLmZvQVsA0qA9cDOEMKBxC2fAt2P8LtXmlmpmZWWlZU1utB//Ed4/3149dVG/ydERGKtXoEeQigPIRQCPYDBwCk13XaE350WQigKIRTl59e5WdgRTZoE7dp5K11ERA7XoFkuIYSdwIvAEKCDmVXs1tgD2JTc0qpr2xYuuggWLIBdu1L5SSIi2ak+s1zyzaxD4vvWwDeBNcBS4NuJ26YAC1NVZIXiYvjiC3j44VR/kohI9qlPC70bsNTMVgJvACUhhKeBG4Efm9kHwHFAyjtDBg+Gv/s7dbuIiNSkzgMuQggrgUE1XN+A96enjZkPjl53HaxaBQMGpPPTRUQyW1asFK3q4oshL0+tdBGRQ2VdoHfqBBMm+CKjffuirkZEJHNkXaCDD47u2AELUz4MKyKSPbIy0L/5TSgoULeLiEhVWRnozZv7/i4lJfBRvc7CFhGJv6wMdIDLL/evM2ZEW4eISKbI2kA/4QTvepkxw3diFBHJdVkb6OCDox9/DEuWRF2JiEj0sjrQJ0yAjh01OCoiAlke6C1bwiWXwBNPwPbtUVcjIhKtrA508G6X/fvhv/876kpERKKV9YF+6qnw9a/rNCMRkawPdPBW+qpV8MYbUVciIhKdWAT6RRdB69YaHBWR3BaLQD/mGPjOd2DePNi7N+pqRESiEYtAB+922b0b/vjHqCsREYlGbAL9zDPh5JPV7SIiuSs2gW7mG3YtWwZr10ZdjYhI+sUm0AGmTPGdGKdPj7oSEZH0i1Wgd+0K550Hs2bBX/8adTUiIukVq0AHHxzduhWeeSbqSkRE0it2gT5uHHTrpsFREck9sQv0Fi28L33RIti0KepqRETSJ3aBDj7b5eBB70sXEckVsQz0Pn1g+HBt2CUiuSWWgQ4+OLp+Pbz0UtSViIikR2wD/cILoX17DY6KSO6IbaC3aQOTJ8Mjj8DOnVFXIyKSerENdPBul6++8l0YRUTiLtaBfsYZcNpp8OCDUVciIpJ6sQ50M2+lr1gBb70VdTUiIqlVZ6CbWYGZLTWzNWa22syuSVy/1cw+M7O3Eq/xqS+34b7/fWjZUoOjIhJ/9WmhHwCuDyGcAgwBrjKz/omf3RtCKEy8FqWsyibo2BEmToS5c70/XUQkruoM9BDC5hDCisT3u4E1QPdUF5ZMxcXw+efw+ONRVyIikjoN6kM3s57AIGB54tKPzGylmU03s2OTXFvSjBoFPXtqcFRE4q3egW5mRwOPAteGEHYBDwAnAYXAZuDuI/zelWZWamalZWVlSSi54Zo18/1dXngBNmyIpAQRkZSrV6CbWR4e5nNDCI8BhBC2hhDKQwgHgd8Dg2v63RDCtBBCUQihKD8/P1l1N9hll/mslxkzIitBRCSl6jPLxYCHgDUhhHuqXO9W5baJwKrkl5c8BQUwdizMnAnl5VFXIyKSfPVpoQ8DLgFGHTJF8ddm9o6ZrQRGAtelstBkKC6GTz+F556LuhIRkeRrUdcNIYRlgNXwo4ycplib88+HTp18cHTcuKirERFJrlivFD3UUUfBpZfCk0/Ctm1RVyMiklw5Fejg3S4HDsCcOVFXIiKSXDkX6P37w5AhOs1IROIn5wIdvJW+Zg28/nrUlYiIJE9OBvp3vwtt22rlqIjES04Gert2Hurz58Pu3VFXIyKSHDkZ6ODdLnv3woIFUVciIpIcORvoQ4dCv37aJ11E4iNnA73iNKPXXoN33426GhGRpsvZQAdfZNSihVrpIhIPOR3onTv7dgCzZ8P+/VFXIyLSNDkd6ODdLtu3w1NPRV2JiEjT5Hygjx0L3bur20VEsl/OB3rz5nD55bB4MXzySdTViIg0Xs4HOnigh+CHX4iIZCsFOtCrlx8kPX06HDwYdTUiIo2jQE8oLoYPP4SlS6OuRESkcRToCRMnQocOGhwVkeylQE9o3Rouvhgeewx27Ii6GhGRhlOgV1FcDPv2wdy5UVciItJwCvQqCgvh9NN1mpGIZCcF+iGKi+Htt2HFiqgrERFpGAX6ISZPhlatNDgqItlHgX6IDh3g29/2fvQvvoi6GhGR+lOg16C4GHbtgkcfjboSEZH6U6DXYPhwOOkkdbuISHZRoNfADK64Al56CT74IOpqRETqR4F+BFOmQLNmvr+LiEg2UKAfQffuMH6878B44EDU1YiI1E2BXoviYti8GZ59NupKRETqpkCvxbnnQpcuGhwVkeygQK9FXh5ceik8/TRs2RJ1NSIitasz0M2swMyWmtkaM1ttZtckrnc0sxIzW5f4emzqy02/K66A8nKYPTvqSkREalefFvoB4PoQwinAEOAqM+sP3AQsCSH0AZYk3sdOv35w5pnasEtEMl+dgR5C2BxCWJH4fjewBugOXADMStw2C5iQqiKjVlwM778Py5ZFXYmIyJE1qA/dzHoCg4DlQJcQwmbw0Ac6J7u4TDFpErRrp8FREcls9Q50MzsaeBS4NoSwqwG/d6WZlZpZaVlZWWNqjFzbtnDRRfDHP8Jf/hJ1NSIiNatXoJtZHh7mc0MIjyUubzWzbomfdwO21fS7IYRpIYSiEEJRfn5+MmqORHGx77748MNRVyIiUrP6zHIx4CFgTQjhnio/ehKYkvh+CrAw+eVljsGDYeBAmDoVVq+OuhoRkcPVp4U+DLgEGGVmbyVe44E7gTFmtg4Yk3gfW2Ywb57v7zJ8OLz5ZtQViYhU16KuG0IIywA7wo9HJ7eczNa/P7zyCoweDaNGwaJFMGxY1FWJiDitFG2gk07yUO/aFc4+G55/PuqKREScAr0RCgrg5ZehVy/f7+Wpp6KuSEREgd5oXbrAiy/6QOk//APMnx91RSKS6xToTXDccbBkCQwdCt/7ng7DEJFoKdCbqH17WLwYxozxueq//W3UFYlIrlKgJ0GbNvDkkzBhAlx9Nfzyl1FXJCK5SIGeJC1bwoIFMHky3Hwz/Ou/andGEUmvOuehS/3l5fm+6W3bwi9+AXv2wL33+mIkEZFUU6AnWfPm8LvfwdFHe5jv2QPTpvl1EZFUUqCngBncfbdvuXv77bB3L8yZ4y14EZFUUaCniBncdpt3v9x4o+/UuGABtGoVdWUiElfq3U2xG26A++/31aTf+pa31kVEUkGBngY//CHMnAkvvABjx+qQDBFJDQV6mkyZ4odjLF/uuzVu3x51RSISNwr0NJo0CZ54AlatghEjYPPmqCsSkThRoKfZuefCs8/Chx/CWWfBRx9FXZGIxIUCPQIjR0JJCZSVwTe+AevWRV2RiMSBAj0iQ4fC0qXw5ZfeUl+1KuqKRCTbKdAjNGiQH5Shc0pFJBkU6BE75RQ/0q59ez+ndNmyqCsSkWylQM8AvXp5S71rV5+nrnNKRaQxFOgZouKc0t69fSbMk09GXZGIZBsFegbp0sUHSgsL/ZzShx+OuiIRySYK9AzTsaN3uQwb5odl6JxSEakvBXoGatfOFx+NHevnlP7nf0ZdkYhkAwV6hmrTxrcJmDgRrrnGT0ASEamNAj2DVZxTevHFfkbpzTfrnFIROTIdcJHhWrSAWbP8oIxf/tKPtLvvPp1TKiKHU6BngWbN4IEHPNTvucdD/fe/1zmlIlKd2nlZwgzuugumToUZM+C003yw9PPPo65MRDKFAj2LmMGtt8LcudC6tQ+Wfu1rcOmlvn2A+tdFcludgW5m081sm5mtqnLtVjP7zMzeSrzGp7ZMqWryZHjjDVixAi6/3GfDnHUW9O8P994Lf/5z1BWKSBTq00KfCZxTw/V7QwiFidei5JYl9TFoEPzXf/nJR9OnQ4cO8OMfe6t98mR48UW12kVySZ2BHkJ4GdiRhlqkkdq29Zb6a6/BypXwgx/4wqSRI6FvX/jNb2DbtqirFJFUa0of+o/MbGWiS+bYpFUkTXLqqT5YumkTzJ7t+8PccAP06AHf+Y5vK3DwYNRVikgqNDbQHwBOAgqBzcDdR7rRzK40s1IzKy0rK2vkx0lDtW4Nl1zig6WrV8NVV8GSJTBmDPTp43Pat2yJukoRSaZGBXoIYWsIoTyEcBD4PTC4lnunhRCKQghF+fn5ja1TmqBisPSzz3yGzPHH+6rTggK48EL4n/9Rq10kDhoV6GbWrcrbiYBOxMwCrVr5YOnSpfDee3Dttb4H+znn+CEb//7vHvoikp3qM21xHvAa0NfMPjWzYuDXZvaOma0ERgLXpbhOSbKKwdJPP4X5870b5mc/gxNOgAsugGeegfLyqKsUkYawkMZ5bUVFRaG0tDRtnycNs349PPigr0TdutW7ZK64wrfwLSiIujqR3GVmb4YQiuq6TytF5W9OOskHSz/5BB591Pveb78devaE886DhQvhwIGoqxSRI1Ggy2Hy8vwIvMWLYcMGH0BdsQImTPAumZ/9DD78MOoqReRQCnSpVc+ecMcd8PHH3kIfNMgP2+jVywdTX3gh6gpFpIICXeqlRQs4/3x4+mlvnU+dCu+8A6NHw6hR8OqrUVcoIgp0abCCAg/09evhP/4D3n0XzjwTxo3zTcNEJBoKdGm0Vq3g6qs92H/9aw/zwYN92uPbb0ddnUjuUaBLk7VtCz/5iQ+g3nEHvPQSFBbCd78La9ZEXZ1I7lCgS9K0bw+33AIbN/rXRYtgwAA/gGP9+qirE4k/Bbok3bHHekt940a4/np45BFfmfpP/+SzZUQkNRTokjKdOnnf+vr18MMf+na+ffrAj37k2/uKSHIp0CXlunXzPdo/+AAuuwx+9ztflXr99Tp4QySZFOiSNgUFHuZr1/qA6X33+QKlm2+GHToTS6TJFOiSdr16wcyZPn/9W9+CO++EE0+E226DXbuirk4keynQJTJ9+8K8eT5nffRouPVWD/Zf/Qr27o26OpHso0CXyJ16Kjz2GJSWwpAhcNNN3oq/7z746quoqxPJHgp0yRhnnOEHa/zv/3rIX3edD54+8ADs3x91dSKZT4EuGWfoUHj+eT8qr1cvn/J48skwfbr2YxepjQJdMtaIEX7m6eLF0Lmzn5x0yil+0LWOxxM5nAJdMpoZjB0Ly5f7fuxt28LFF8PAgb4C9eDBqCsUyRwKdMkKZr4f+4oVsGCBB/mkSd7v/tRTkMajcUUylgJdskqzZh7kq1bBnDmwe7cH/ZAh8NxzCnbJbQp0yUrNm3vXy5o18OCDsGWLd80MH+7b94rkIgW6ZLW8PB8sff99uP9+3y9mxAgYMwZefz3q6kTSS4EusdCypU9vXL8e7rnHV58OHQrnnef97iK5QIEusdK6tS9I2rABfvELX6R0xhlw4YXe7y4SZwp0iaWjj4af/tQP2Zg61RcqDRwIkyd794xIHCnQJdaOOcY3/dq40feIWbjQFyddfrlfE4kTBbrkhI4dvQtm40a45hrf5fHkk+Gf/xk+/TTq6kSSQ4EuOaVzZx80Xb8errzS94fp3dtDfsuWqKsTaRoFuuSk7t19muO6dT6f/f77fSOwG2+E7dujrk6kcRToktNOOMEXJr33ns+E+c1v/JCNf/s32Lkz6upEGkaBLoJ3u8yZ41Mbx42DO+7wYP/5z317AZFsUGegm9l0M9tmZquqXOtoZiVmti7x9djUlimSHv37++Zff/oTnHUW3HKLd8XcdRd88UXU1YnUrj4t9JnAOYdcuwlYEkLoAyxJvBeJjcJCn+K4fLkvTPrJT/z0pN/+Fvbti7o6kZrVGeghhJeBHYdcvgCYlfh+FjAhyXWJZITBg/2AjVde8UOtr77au2emTYO//jXq6kSqa2wfepcQwmaAxNfOyStJJPOceaYfiff881BQAD/4AfTrB7Nm6Vg8yRwpHxQ1syvNrNTMSsvKylL9cSIpYwajR8Orr/ph1h06wGWXwYAB8PDDOj1JotfYQN9qZt0AEl+3HenGEMK0EEJRCKEoPz+/kR8nkjnMYPx4KC2Fxx7zLXy/9z3fUuCmm3w/dnXHSBQaG+hPAlMS308BFianHJHsYQYTJ/pWvfPmwde+Bnff7fuxd+rkJyvNmAGbN0ddqeQKC3Wc2WVm84ARQCdgKzAVeAJYABwPfAxMCiEcOnB6mKKiolBaWtrEkkUy165dsGQJLFrkr02b/PqgQd6qHz8e/v7v/cQlkfoyszdDCEV13ldXoCeTAl1ySQiwcqUH+7PP+t7s5eW+UdjYsb6A6ZxzQD2RUhcFukiG+fxzKCnxgF+8GLZu9W6br3/dW+7jxkFRkR+ELVKVAl0kgx086KtRK7pmli/3Fn1+vrfax4+Hs8/21ryIAl0ki2zfDs89V9l6//OfvaU+ZEhl33thobfoJfco0EWyVHk5vPGG97svWuTTIwG6dfNumXHjYMwYP41JcoMCXSQmtm71VvuiRd6K37kTWrSAYcMq+94HDFDrPc4U6CIxdOAAvP56Zd/722/79R49KrtmRo/2Q7IlPhToIjngs88qW+8lJb53e16eb/1bEfB9+6r1nu0U6CI5Zv9+n+te0Xpfvdqvn3hiZdfMyJHQpk20dUrDKdBFctxHH/nA6rPP+i6RX3wBLVt6qFcEfO/eUVcp9aFAF5G/+eor39O9YtXq2rV+vU+fyq6Zs86CVq2irVNqpkAXkSNav75yWuTSpR74bdrAqFGVrfeePaOuUioo0EWkXr78El580cP9mWdg40a/3r+/B/v48X7Ax1FHRVpmTlOgi0iDhQDvv1/ZNfPSSz7YevTRvpipYmFTjx5RV5pbFOgi0mR79sALL1TOnPnkE78+cGBl3/vQob7QSVJHgS4iSRUCvPtuZbgvW+YLnY45xjcSGz/eNxbr2jXqSuNHgS4iKbVrl0+HrAj4ipOZTj/dw33YMD9I+/jjtSVwUynQRSRtqh7msWiRL3CqODS7VStfrdqvX+XXfv3g5JOhbdto684WCnQRiczOnfDOO/Dee9VfGzd6+Fc4/vjDg75fP99ZUtsVVKpvoGsoQ0SSrkMH+MY3/FXVV1/BBx8cHvTTp8PevZX3tWtXGe5Vw753b1/tKjVToItI2rRq5Vv9DhhQ/XoIvtHY2rXVg37pUpgzp/K+Zs2gV6+aw75Tp/T+v2QiBbqIRM7M57b36OHb/1a1Z4/PjT+0VV9SAvv2Vd533HE1B/2JJ+bOtEr1oYtIViov9w3I3nvv8Jb9tm2V9+Xl+Z41h4Z9377Zc+qT+tBFJNaaN/ful169fJpkVTt2eMhXDfrVq2HhQv+HoEK3btUHYyvCvqAgO6daKtBFJHY6dvQVrEOHVr++fz9s2HB4q/4Pf4C//KXyvtatD59507evT7XM5P3kFegikjOOOqoyoKsKwbtpKgK+IuyXL4f586tPtTzhhJpb9V27Rj/VUoEuIjnPDLp08dfw4dV/9uWXsG7d4WH/yit+aEiF9u2rB31F2Pfunb6dKhXoIiK1aN3aNyMbOLD69YMHfapl1cHYtWthyRKYPbvyvoq+/mnTYMSI1NaqQBcRaYRmzXzwtKDAtxauavfu6n30a9dC586pr0mBLiKSZO3aQVGRv9IpCyfmiIhITRToIiIxoUAXEYmJJvWhm9mHwG6gHDhQn6WpIiKSGskYFB0ZQtiehP+OiIg0gbpcRERioqmBHoDnzOxNM7uyphvM7EozKzWz0rKysiZ+nIiIHElTA31YCOF0YBxwlZmddegNIYRpIYSiEEJRfn5+Ez9ORESOJGn7oZvZrcCeEMJdtdxTBnyUlA+MTidAYwaV9Dwq6VlUp+dRXVOexwkhhDpbxI0eFDWztkCzEMLuxPdnA7fX9jv1KSjTmVmpZvNU0vOopGdRnZ5Hdel4Hk2Z5dIFeNx8v8gWwB9CCIuTUpWIiDRYowM9hLABOC2JtYiISBNo2mLDTYu6gAyj51FJz6I6PY/qUv480npItIiIpI5a6CIiMaFAr4WZTTezbWa2qsq1jmZWYmbrEl+PjbLGdDGzAjNbamZrzGy1mV2TuJ6rz6OVmf2fmb2deB63Ja6faGbLE89jvpml6fCx6JlZczP7k5k9nXify8/iQzN7x8zeMrPSxLWU/60o0Gs3EzjnkGs3AUtCCH2AJYn3ueAAcH0I4RRgCL6QrD+5+zz2AaNCCKcBhcA5ZjYE+BVwb+J5fA4UR1hjul0DrKnyPpefBfg+V4VVpiqm/G9FgV6LEMLLwI5DLl8AzEp8PwuYkNaiIhJC2BxCWJH4fjf+h9ud3H0eIYSwJ/E2L/EKwCjgkcT1nHkeZtYDOBd4MPHeyNFnUYuU/60o0BuuSwhhM3jIAWk4KTCzmFlPYBCwnBx+HokuhreAbUAJsB7YGUI4kLjlU/wfvVxwH3ADcDDx/jhy91lAzftcpfxvRWeKSoOY2dHAo8C1IYRdiYVlOSmEUA4UmlkH4HHglJpuS29V6Wdm5wHbQghvmtmIiss13Br7Z1HFsBDCJjPrDJSY2Xvp+FC10Btuq5l1A0h83RZxPWljZnl4mM8NITyWuJyzz6NCCGEn8CI+ttDBzCoaSj2ATVHVlUbDgPMTB948jHe13EduPgsAQgibEl+34f/YDyYNfysK9IZ7EpiS+H4KsDDCWtIm0Sf6ELAmhHBPlR/l6vPIT7TMMbPWwDfxcYWlwLcTt+XE8wgh/DSE0COE0BO4CHghhPB9cvBZgO9zZWbtKr7H97laRRr+VrSwqBZmNg8Yge+SthWYCjwBLACOBz4GJoUQDh04jR0zOxN4BXiHyn7Sm/F+9Fx8HgPxga3meMNoQQjhdjPrhbdSOwJ/Ai4OIeyLrtL0SnS5/EsI4bxcfRaJ/+/HE28r9rn6uZkdR4r/VhToIiIxoS4XEZGYUKCLiMSEAl1EJCYU6CIiMaFAFxGJCQW6iEhMKNBFRGJCgS4iEhP/D7s+7riu2VE2AAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 绘制不同K对应的聚类的性能，找到最佳模型／参数（分数最高）\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "\n",
    "plt.plot(Ks, np.array(CH_scores), 'b-',label = 'CH_scores')\n",
    "\n",
    "\n",
    "### 最佳超参数\n",
    "index = np.unravel_index(np.argmax(CH_scores, axis=None), len(CH_scores))\n",
    "Best_K = Ks[ index[0]]\n",
    "\n",
    "print(Best_K)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 用最佳的K再次聚类，得到聚类结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "mb_kmeans = MiniBatchKMeans(n_clusters = Best_K)\n",
    "\n",
    "y_pred = mb_kmeans.fit_predict(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([4, 3, 0, ..., 3, 3, 3])"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_pred"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 保存聚类的结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "#保存聚类结果\n",
    "feat_names_Kmeans = \"Kmeans_\" + str(Best_K)\n",
    "\n",
    "y = pd.Series(data = y, name = 'target')\n",
    "train_kmeans = pd.concat([pd.Series(name = feat_names_Kmeans, data = y_pred), y], axis = 1)\n",
    "train_kmeans.to_csv( 'company_classification_train_KMeans.csv',index=False,header=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 保存KMeans模型，用于后续对测试数据的聚类"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "import _pickle as cPickle\n",
    "\n",
    "cPickle.dump(mb_kmeans, open(\"mb_kmeans.pkl\", 'wb'))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
